This is a reproducibility report for “De novo discovery of traits co-occurring with chronic obstructive pulmonary disease” study.
Python (version 3.6.9), R (version 4.0.2) and RStudio (version 1.2.5033) were used for data processing, analysis and visualisation.
First, we run CoDeS3D pipeline to get COPD-associated spatial
regulatory interactions across lung tissue (using GTEx lung eQTL data
and Hi-C libraries for primary lung cells):
python codes3d/codes3d.py -s data/snps/263_copd_snps_5E-8.txt -o results/codes3d/lung -n Lung_cells_Schmitt2016 -t Lung.
### Getting number of SNPs vs eQTLs in lung across the entire genome
lung.map.genome <- data.frame(
snps = rep(c("eQTL","non-eQTL")),
number = c(740028, (43174097-740028)),
percentage = c(round((740028/43174097)*100, 2),
round(((43174097-740028)/43174097)*100, 2)))
#pdf("figures/lung_eqtl_map/lung_map_eqtls_vs_non-eqtls_genomewide.pdf", width = 9, height = 9)
ggplot(lung.map.genome, aes(x = "", y = percentage, fill = snps)) +
geom_bar(width = 1, stat = "identity", fill = c("#009444", "darkgrey"), color = "white") +
coord_polar("y", start = 0)+
#geom_text(aes(y = percentage, label = snps), color = "white")+
#scale_fill_manual(values = mycols) +
theme_void()
#dev.off()
### Getting number of eGenes
lung.map.genes <- data.frame(
genes = rep(c("eGenes","non-eGenes")),
number = c(15855, (56200-15855)),
percentage = c(round((15855/56200)*100, 2),
round(((56200-15855)/56200)*100, 2)))
#pdf("figures/lung_eqtl_map/lung_map_egeness_vs_non-egenes_genomewide.pdf", width = 9, height = 9)
ggplot(lung.map.genes, aes(x = "", y = percentage, fill = genes)) +
geom_bar(width = 1, stat = "identity", fill = c("#009444", "darkgrey"), color = "white") +
coord_polar("y", start = 0)+
#geom_text(aes(y = percentage, label = snps), color = "white")+
#scale_fill_manual(values = mycols) +
theme_void()
#dev.off()
### reading lung-specific GRN
lung_map <- vroom(file="./results/lung_specific_GRN/lung_grn_significant_eqtls.txt.gz",
delim="\t")
lung_map.df <- distinct(lung_map[,c("snp", "gene", "interaction_type")])
## Warning: One or more parsing issues, see `problems()` for details
lung_map_cis <- subset(lung_map.df, interaction_type=='Cis') # 840,170
#length(unique(lung_map_cis$snp)) # 711,397 eQTL SNPs
#length(unique(lung_map_cis$gene)) # 14,700 eGenes
lung_map_intra <- subset(lung_map.df, interaction_type=='Trans-intrachromosomal') # 16,213
#length(unique(lung_map_intra$snp)) # 16,175 eQTL SNPs
#length(unique(lung_map_intra$gene)) # 2,702 eGenes
lung_map_inter <- subset(lung_map.df, interaction_type=='Trans-interchromosomal') # 6,540
#length(unique(lung_map_inter$snp)) # 6,385 eQTL SNPs
#length(unique(lung_map_inter$gene)) # 2,003 eGenes
lung_map_cis.freq <- count(distinct(lung_map_cis[,c("snp", "gene")]), gene) %>%
mutate("interaction_type"="Cis")
lung_map_intra.freq <- count(distinct(lung_map_intra[,c("snp", "gene")]), gene) %>%
mutate("interaction_type"="Trans-intrachromosomal")
lung_map_inter.freq <- count(distinct(lung_map_inter[,c("snp", "gene")]), gene) %>%
mutate("interaction_type"="Trans-interchromosomal")
lung_map_combined <- rbind(lung_map_cis.freq, lung_map_intra.freq, lung_map_inter.freq)
lung_map_combined <- lung_map_combined %>%
mutate(interaction_type=factor(interaction_type,
levels=c("Cis",
"Trans-intrachromosomal",
"Trans-interchromosomal")))
# computing mean
lung_map_combined %>% group_by(interaction_type) %>% summarise(mean = mean(log10(n)))
## # A tibble: 3 × 2
## interaction_type mean
## <fct> <dbl>
## 1 Cis 1.30
## 2 Trans-intrachromosomal 0.466
## 3 Trans-interchromosomal 0.259
lung_map_combined %>% group_by(interaction_type) %>% summarise(mean = mean(n))
## # A tibble: 3 × 2
## interaction_type mean
## <fct> <dbl>
## 1 Cis 57.2
## 2 Trans-intrachromosomal 6.00
## 3 Trans-interchromosomal 3.27
# computing SD
lung_map_combined %>% group_by(interaction_type) %>% summarise(sd=sd(log10(n)))
## # A tibble: 3 × 2
## interaction_type sd
## <fct> <dbl>
## 1 Cis 0.707
## 2 Trans-intrachromosomal 0.471
## 3 Trans-interchromosomal 0.386
lung_map_combined %>% group_by(interaction_type) %>% summarise(sd=sd(n))
## # A tibble: 3 × 2
## interaction_type sd
## <fct> <dbl>
## 1 Cis 93.2
## 2 Trans-intrachromosomal 10.5
## 3 Trans-interchromosomal 6.09
# comparing number of eQTLs per gene grouped by interaction type
eqtls_per_gene <- list(c("Cis", "Trans-intrachromosomal"),
c("Cis", "Trans-interchromosomal"),
c("Trans-interchromosomal", "Trans-intrachromosomal"))
#pdf("figures/lung_eqtl_map/violin_eqtls_per_gene.pdf", width = 12, height = 8)
violin_eqtls_per_gene <- ggplot(lung_map_combined,
aes(x=interaction_type, y=log10(n), fill=interaction_type)) +
geom_violin(trim=T) +
stat_summary(fun = "median", geom = "point", shape = 3, size = 4, color = "white") +
stat_summary(fun.data=mean_sdl, fun.args = list(mult = 1),
geom="pointrange", color="red", shape=3, size=0.85) +
scale_x_discrete("Interaction type") +
scale_y_continuous("log10(Number of eQTL SNPs)", expand = c(0,0)) +
#scale_fill_manual(values=c("lightgrey", "grey", "darkgrey")) +
scale_fill_manual(values=c("#009444", "#009444", "#009444")) +
theme_classic() +
theme(plot.title = element_blank(),
axis.title.x = element_blank(),
legend.text=element_text(size=20),
legend.title=element_blank(),
legend.position = "none",
#legend.direction = "horizontal",
axis.text=element_text(size=22, colour = "black"),
axis.title=element_text(size=24, colour = "black"),
strip.text.x = element_text(size = 19, colour = "black")) +
#theme(axis.text.x= element_text(size=12)) +
stat_compare_means(comparisons = eqtls_per_gene, method = "t.test", label = "p.signif")
violin_eqtls_per_gene
#dev.off()
### reading GTEx lung v8
lung_tpm <- vroom(file="./data/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_median_tpm.gct.gz", delim = "\t", skip=2, col_select = c("Description", "Lung", "gencode_id"= Name))
lung_map <- vroom(file="./results/lung_specific_GRN/lung_grn_significant_eqtls.txt.gz",
delim="\t")
lung_map.df <- distinct(lung_map[,c("snp", "gencode_id", "interaction_type")])
## Warning: One or more parsing issues, see `problems()` for details
lung_map_cis <- subset(lung_map.df, interaction_type=='Cis')
lung_map_intra <- subset(lung_map.df, interaction_type=='Trans-intrachromosomal')
lung_map_inter <- subset(lung_map.df, interaction_type=='Trans-interchromosomal')
lung_map_cis.freq <- count(distinct(lung_map_cis[,c("snp", "gencode_id")]), gencode_id) %>%
mutate("interaction_type"="Cis")
lung_map_intra.freq <- count(distinct(lung_map_intra[,c("snp", "gencode_id")]), gencode_id) %>%
mutate("interaction_type"="Trans-intrachromosomal")
lung_map_inter.freq <- count(distinct(lung_map_inter[,c("snp", "gencode_id")]), gencode_id) %>%
mutate("interaction_type"="Trans-interchromosomal")
lung_map_combined <- rbind(lung_map_cis.freq, lung_map_intra.freq, lung_map_inter.freq)
lung_map_combined <- lung_map_combined %>%
mutate(interaction_type=factor(interaction_type,
levels=c("Cis",
"Trans-intrachromosomal",
"Trans-interchromosomal")))
lung_map_combined_tpm <- merge(lung_map_combined, lung_tpm, by= "gencode_id")
lung_map_combined_tpm.df <- lung_map_combined_tpm %>%
relocate("gene"=Description, gencode_id, "tpm"=Lung, interaction_type) %>%
mutate(interaction_type=factor(interaction_type,
levels=c("Cis",
"Trans-intrachromosomal",
"Trans-interchromosomal")))
# computing mean
lung_map_combined_tpm.df %>% group_by(interaction_type) %>% summarise(mean = mean(log10(tpm+1)))
## # A tibble: 3 × 2
## interaction_type mean
## <fct> <dbl>
## 1 Cis 0.908
## 2 Trans-intrachromosomal 0.870
## 3 Trans-interchromosomal 0.857
lung_map_combined_tpm.df %>% group_by(interaction_type) %>% summarise(mean = mean(tpm))
## # A tibble: 3 × 2
## interaction_type mean
## <fct> <dbl>
## 1 Cis 26.7
## 2 Trans-intrachromosomal 18.6
## 3 Trans-interchromosomal 17.3
# computing SD
lung_map_combined_tpm.df %>% group_by(interaction_type) %>% summarise(sd=sd(log10(tpm+1)))
## # A tibble: 3 × 2
## interaction_type sd
## <fct> <dbl>
## 1 Cis 0.637
## 2 Trans-intrachromosomal 0.586
## 3 Trans-interchromosomal 0.592
lung_map_combined_tpm.df %>% group_by(interaction_type) %>% summarise(sd=sd(tpm))
## # A tibble: 3 × 2
## interaction_type sd
## <fct> <dbl>
## 1 Cis 102.
## 2 Trans-intrachromosomal 56.2
## 3 Trans-interchromosomal 40.6
# comparing number of eQTLs per gene grouped by interaction type
tpm_expression <- list(c("Cis", "Trans-intrachromosomal"),
c("Cis", "Trans-interchromosomal"),
c("Trans-interchromosomal", "Trans-intrachromosomal"))
#pdf("figures/lung_eqtl_map/violin_expression.pdf", width = 12, height = 8)
violin_expression <- ggplot(lung_map_combined_tpm.df,
aes(x=interaction_type, y=log10(tpm+1), fill=interaction_type)) +
geom_violin(trim=T) +
stat_summary(fun = "median", geom = "point", shape = 3, size = 4, color = "white") +
stat_summary(fun.data=mean_sdl, fun.args = list(mult = 1),
geom="pointrange", color="red", shape=3, size=0.85) +
scale_x_discrete("Interaction type") +
scale_y_continuous("log10(TPM+1)", expand = c(0,0)) +
scale_fill_manual(values=c("#009444", "#009444", "#009444")) +
theme_classic() +
theme(plot.title = element_blank(),
axis.title.x = element_blank(),
legend.text=element_text(size=20),
legend.title=element_blank(),
legend.position = "none",
#legend.direction = "horizontal",
axis.text=element_text(size=22, colour = "black"),
axis.title=element_text(size=24, colour = "black"),
strip.text.x = element_text(size = 19, colour = "black")) +
#theme(axis.text.x= element_text(size=12)) +
stat_compare_means(comparisons = tpm_expression, method = "t.test", label = "p.signif")
violin_expression
#dev.off()
Correlation analysis of the number of cis-, trans-intrachromosomal and trans-interchromosomal eQTL-eGene interactions and all variants (genotyped from lung samples obtained from GTEx v8) across different chromosomes in lung-specific regulatory map. The file “GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table_short_version.txt.gz” is available by request only.
# getting number of SNPs per chromosome
gtex_snps <- vroom(file="./data/GTEx_Analysis_2017-06-05_v8_WholeGenomeSeq_838Indiv_Analysis_Freeze.lookup_table_short_version.txt.gz", delim ="\t")
lung_map_chr1 <- vroom(file="./results/lung_specific_GRN/significant_eqtls_chr1.txt.gz", delim="\t", col_select = c(snp, snp_chr))
lung_map_chr2 <- vroom(file="./results/lung_specific_GRN/significant_eqtls_chr2.txt.gz", delim="\t", col_select = c(snp, snp_chr))
lung_map_chr3 <- vroom(file="./results/lung_specific_GRN/significant_eqtls_chr3.txt.gz", delim="\t", col_select = c(snp, snp_chr))
lung_map_chr4 <- vroom(file="./results/lung_specific_GRN/significant_eqtls_chr4.txt.gz", delim="\t", col_select = c(snp, snp_chr))
lung_map_chr5 <- vroom(file="./results/lung_specific_GRN/significant_eqtls_chr5.txt.gz", delim="\t", col_select = c(snp, snp_chr))
lung_map_chr6 <- vroom(file="./results/lung_specific_GRN/significant_eqtls_chr6.txt.gz", delim="\t", col_select = c(snp, snp_chr))
lung_map_chr7 <- vroom(file="./results/lung_specific_GRN/significant_eqtls_chr7.txt.gz", delim="\t", col_select = c(snp, snp_chr))
lung_map_chr8 <- vroom(file="./results/lung_specific_GRN/significant_eqtls_chr8.txt.gz", delim="\t", col_select = c(snp, snp_chr))
lung_map_chr9 <- vroom(file="./results/lung_specific_GRN/significant_eqtls_chr9.txt.gz", delim="\t", col_select = c(snp, snp_chr))
lung_map_chr10 <- vroom(file="./results/lung_specific_GRN/significant_eqtls_chr10.txt.gz", delim="\t", col_select = c(snp, snp_chr))
lung_map_chr11 <- vroom(file="./results/lung_specific_GRN/significant_eqtls_chr11.txt.gz", delim="\t", col_select = c(snp, snp_chr))
lung_map_chr12 <- vroom(file="./results/lung_specific_GRN/significant_eqtls_chr12.txt.gz", delim="\t", col_select = c(snp, snp_chr))
lung_map_chr13 <- vroom(file="./results/lung_specific_GRN/significant_eqtls_chr13.txt.gz", delim="\t", col_select = c(snp, snp_chr))
lung_map_chr14 <- vroom(file="./results/lung_specific_GRN/significant_eqtls_chr14.txt.gz", delim="\t", col_select = c(snp, snp_chr))
lung_map_chr15 <- vroom(file="./results/lung_specific_GRN/significant_eqtls_chr15.txt.gz", delim="\t", col_select = c(snp, snp_chr))
lung_map_chr16 <- vroom(file="./results/lung_specific_GRN/significant_eqtls_chr16.txt.gz", delim="\t", col_select = c(snp, snp_chr))
lung_map_chr17 <- vroom(file="./results/lung_specific_GRN/significant_eqtls_chr17.txt.gz", delim="\t", col_select = c(snp, snp_chr))
lung_map_chr18 <- vroom(file="./results/lung_specific_GRN/significant_eqtls_chr18.txt.gz", delim="\t", col_select = c(snp, snp_chr))
lung_map_chr19 <- vroom(file="./results/lung_specific_GRN/significant_eqtls_chr19.txt.gz", delim="\t", col_select = c(snp, snp_chr))
lung_map_chr20 <- vroom(file="./results/lung_specific_GRN/significant_eqtls_chr20.txt.gz", delim="\t", col_select = c(snp, snp_chr))
lung_map_chr21 <- vroom(file="./results/lung_specific_GRN/significant_eqtls_chr21.txt.gz", delim="\t", col_select = c(snp, snp_chr))
lung_map_chr22 <- vroom(file="./results/lung_specific_GRN/significant_eqtls_chr22.txt.gz", delim="\t", col_select = c(snp, snp_chr))
lung_map_chrX <- vroom(file="./results/lung_specific_GRN/significant_eqtls_chrX.txt.gz", delim="\t", col_select = c(snp, snp_chr))
lung_map <- rbind(lung_map_chr1, lung_map_chr2, lung_map_chr3, lung_map_chr4, lung_map_chr5,
lung_map_chr6, lung_map_chr7, lung_map_chr8, lung_map_chr9, lung_map_chr10,
lung_map_chr11, lung_map_chr12, lung_map_chr13, lung_map_chr14,
lung_map_chr15, lung_map_chr16, lung_map_chr17, lung_map_chr18,
lung_map_chr19, lung_map_chr20, lung_map_chr21, lung_map_chr22, lung_map_chrX)
# counting the number of SNPs per chromosome (GTEx and lung-specific GRN)
gtex_snps_count <- gtex_snps %>% count(chr)
lung_map_eqtls_count <- lung_map %>% count(snp_chr)
# calculating gene density per Mb for each chromosome
genomeSum <- read.csv("./data/genomeSummary_grch38p13.txt", sep="\t", skip=1)
genomeSum <- genomeSum %>%
mutate(geneDensity = genomeSum$Gene/genomeSum$Size..Mb.) %>% #compute gene density
mutate(Chromosome = tolower(Type)) %>% dplyr::select(geneDensity, Chromosome)
genomeSum[23,2] <- "chrX"; genomeSum[24,2] <- "chrY"; genomeSum[25,2] <- "chrMT"
# merging the two count tables (GTEx and lung-specific GRN)
lung_map_count_merged <- merge(lung_map_eqtls_count, gtex_snps_count, by=1, all=TRUE) %>%
dplyr::rename("Chromosome"=snp_chr, "eQTL_eGene_interactions"=n.x, "snps_number"=n.y) %>%
merge(genomeSum, by="Chromosome")
#pdf("figures/lung_eqtl_map/cor_lung_map_cis_per_chromosome.pdf", width = 13, height = 6)
options(scipen = 6)
corr_gg <- ggplot(lung_map_count_merged, aes(x = snps_number, y = eQTL_eGene_interactions)) +
geom_smooth(fullrange = TRUE, method = lm, alpha = 0.2, level = 0.95, color = "red",
fill = "#009444") +
geom_count(aes(size = geneDensity)) +
geom_text_repel(aes(label = Chromosome),
min.segment.length = 0,
box.padding = 0.35,
point.padding = 0.5,
segment.color = 'grey50',
max.overlaps = 30) +
stat_cor(method = "pearson", label.x = 1000, label.y = 90000, size = 7) +
scale_x_continuous("Number of SNPs", expand = c(0, 0), limits = c(0, NA)) +
# the limits argument is set to -100000 so that the CI band can extend to the end of the plot
# setting limits to -big number ensures the CI band extends to ends of plot along with coord_cartesian
scale_y_continuous("Number of eQTL-eGene interactions", expand = c(0, 0),
limits = c(-1000000, 100000)) +
#labs(title = "Correlation between the number of eQTLs from lung-specific GRN and the number of SNPs from GTEx", hjust = 1) +
coord_cartesian(xlim = c(0, NA), ylim = c(0, NA)) +
guides(size = guide_legend("Genes/Mb")) +
scale_size_continuous(limits = c(10, 60)) +
theme_classic() +
theme(plot.title = element_blank(),
#axis.title.x = element_blank(),
legend.text=element_text(size=16),
legend.title=element_text(size=18),
#legend.position = "none",
#legend.direction = "horizontal",
axis.text=element_text(size=16, colour = "black"),
axis.title=element_text(size=18, colour = "black"))
corr_gg
#dev.off()
### Getting number of COPD-associated GWAS snps
snps <- readLines("data/snps/263_COPD_snps_5E-8.txt") # 263
### Reading significant spatial interactions in lung
sp_lung <- read.table(gzfile("results/codes3d/lung/significant_eqtls.txt.gz"),
header = TRUE, sep = "\t") # 151
### Getting spatial eQTLs in lung
sp_lung_eqtls <- unique(sp_lung$snp) # 103
#writeLines(sp_lung_eqtls, con = "results/codes3d/lung/copd_lung_eqtl_snps.txt", sep = "\n")
### Getting spatially regulated genes in lung
sp_lung_egenes <- unique(sp_lung$gene) # 107
#writeLines(sp_lung_egenes, con = "results/codes3d/lung/copd_lung_egenes.txt", sep = "\n")
# getting eqtls vs non-eqtls in lung
copd.snp.lung <- data.frame(
snps = c("eQTL","non-eQTL"),
number = c(length(sp_lung_eqtls), length(snps)-length(sp_lung_eqtls)),
percentage = c(round(length(sp_lung_eqtls)/length(snps)*100, 2),
round((length(snps)-length(sp_lung_eqtls))/length(snps)*100, 2)))
#pdf("figures/functional_annotation/lung_eqtls_vs_non-eqtls.pdf", width = 8, height = 9)
ggplot(copd.snp.lung, aes(x = factor(snps, level=c("non-eQTL","eQTL")), y = percentage),
fill = snps) +
geom_bar(stat="identity", position = "dodge") +
scale_fill_manual(values=c("#ED1C24", "grey")) +
scale_y_continuous("Percentage", expand = c(0,0), limits = c(0,65)) +
theme_classic() +
theme(plot.title = element_blank(),
axis.title.x = element_blank(),
legend.text=element_text(size=20),
legend.title=element_blank(),
legend.position = "none",
#legend.direction = "horizontal",
axis.text=element_text(size=20, colour = "black"),
axis.title=element_text(size=28, colour = "black"),
strip.text.x = element_text(size = 19, colour = "black")) +
geom_text(aes(y = percentage, label = paste0("n=", number)),
position=position_dodge(width=0.9), vjust=-0.25, color = "black",
size = 8, fontface = 'italic')
#dev.off()
We used wANNOVAR tool to
obtain information about the locus eQTLs tagged (9 June 2021). First, we
extracted the funnctional annotation for COPD-associated eQTLs:
cut -f1,2,3,6,132 results/wannovar/query.output.genome_summary.txt | sort -u > results/wannovar/copd_fun_snp_ann.txt
# calculating percentage of functionally annotated SNPs
cal_ann <- function(phe, total) {
df <- data.frame(matrix(ncol=3, nrow=0))
colnames(df)<- c("annotation","number", "percent")
for (i in 1:length(phe)){
down <- nrow(phe[which(phe$Func.refGene=="downstream"), ])
df[nrow(df) + 1,] = list(annotation = "downstream", number = down,
percent = down/total*100)
ex <- nrow(phe[which(phe$Func.refGene=="exonic"), ])
df[nrow(df) + 1,] = list(annotation = "exonic", number = ex,
percent = ex/total*100)
inter <- nrow(phe[which(phe$Func.refGene=="intergenic"), ])
df[nrow(df) + 1,] = list(annotation = "intergenic", number = inter,
percent = inter/total*100)
intro <- nrow(phe[which(phe$Func.refGene=="intronic"), ])
df[nrow(df) + 1,] = list(annotation = "intronic", number = intro,
percent = intro/total*100)
ncRNA_ex <- nrow(phe[which(phe$Func.refGene=="ncRNA_exonic"), ])
df[nrow(df) + 1,] = list(annotation = "ncRNA_exonic", number = ncRNA_ex,
percent = ncRNA_ex/total*100)
ncRNA_in <- nrow(phe[which(phe$Func.refGene=="ncRNA_intronic"), ])
df[nrow(df) + 1,] = list(annotation = "ncRNA_intronic", number = ncRNA_in,
percent = ncRNA_in/total*100)
up <- nrow(phe[which(phe$Func.refGene=="upstream"), ])
df[nrow(df) + 1,] = list(annotation = "upstream", number = up,
percent = up/total*100)
UTR3 <- nrow(phe[which(phe$Func.refGene=="UTR3"), ])
df[nrow(df) + 1,] = list(annotation = "UTR3", number = UTR3,
percent = UTR3/total*100)
}
df[!duplicated(df), ]
}
copd_ann <- read.table("results/wannovar/copd_fun_snp_ann.txt", sep = "\t", header=FALSE)
copd_ann <- copd_ann[order(copd_ann$V1, decreasing = TRUE),]
colnames(copd_ann) <- copd_ann[1, ]; copd_ann <- copd_ann[-1, ]
copd_ann_lung <- subset(copd_ann, copd_ann$Otherinfo %in% sp_lung_eqtls) # 103
copd_fun_ann_lung <- cal_ann(copd_ann_lung, length(sp_lung_eqtls))
# lung eQTLs
l_number <- c(copd_fun_ann_lung[copd_fun_ann_lung$annotation=="downstream",][,2],
copd_fun_ann_lung[copd_fun_ann_lung$annotation=="exonic",][,2],
copd_fun_ann_lung[copd_fun_ann_lung$annotation=="intergenic",][,2],
copd_fun_ann_lung[copd_fun_ann_lung$annotation=="intronic",][,2],
copd_fun_ann_lung[copd_fun_ann_lung$annotation=="ncRNA_intronic",][,2],
copd_fun_ann_lung[copd_fun_ann_lung$annotation=="ncRNA_exonic",][,2],
copd_fun_ann_lung[copd_fun_ann_lung$annotation=="UTR3",][,2])
l_percent <- c(copd_fun_ann_lung[copd_fun_ann_lung$annotation=="downstream",][,3],
copd_fun_ann_lung[copd_fun_ann_lung$annotation=="exonic",][,3],
copd_fun_ann_lung[copd_fun_ann_lung$annotation=="intergenic",][,3],
copd_fun_ann_lung[copd_fun_ann_lung$annotation=="intronic",][,3],
copd_fun_ann_lung[copd_fun_ann_lung$annotation=="ncRNA_intronic",][,3],
copd_fun_ann_lung[copd_fun_ann_lung$annotation=="ncRNA_exonic",][,3],
copd_fun_ann_lung[copd_fun_ann_lung$annotation=="UTR3",][,3])
lung_eqtls.df <- data.frame(annotation = c("downstream", "exonic", "intergenic", "intronic",
"ncRNA_intronic", "ncRNA_exonic", "UTR3"),
number = l_number,
percent = l_percent)
#pdf("figures/functional_annotation/lung_copd_fun_ann.pdf", width = 8, height = 5)
ggplot(lung_eqtls.df, aes(x = annotation, y = percent, fill = "#009444")) +
geom_bar(stat="identity", position = "dodge") +
theme_classic() +
theme(plot.title = element_blank(),
axis.title.x = element_blank(),
legend.position = "none",
axis.text=element_text(size=14, colour = "black"),
axis.text.x=element_text(vjust = 0.5, hjust=0.75),
axis.title=element_text(size=14, colour = "black")) +
scale_fill_manual(values = "#009444") +
scale_y_continuous("Functional annotation", expand = c(0,0), limits = c(0,75)) +
geom_text(aes(y = percent, label = paste0(round(percent, digits=2), "%")),
position=position_dodge(width=0.9), vjust=0.75, hjust=-0.15,
color = "black", size = 3.5) +
#scale_y_continuous(limits = c(0, 70)) +
xlab("Percentage") +
coord_flip()
#dev.off()
# getting the number of eGenes per eQTL SNP
copd.snps.x <- distinct(sp_lung[,c("snp", "gene")])
copd.snps.freq <- count(distinct(sp_lung[,c("snp", "gene")]), snp) # 103 eQTL SNPs
copd.snps.freq <- data.frame(table(copd.snps.freq$n))
colnames(copd.snps.freq) <- c("genes", "snps")
#copd.snps.freq$genes <- as.numeric(as.character(copd.snps.freq$genes))
#pdf("figures/functional_annotation/lung_copd_genes_per_eqtl.pdf", width = 5, height = 9)
ggplot(copd.snps.freq, aes(x = factor(snps, level=c("2","8", "26", "67")), y = genes)) +
geom_bar(stat="identity", position = "dodge", fill = "#009444") +
scale_fill_manual(values=c("#009444")) +
scale_x_discrete("Number of eQTL SNPs") +
scale_y_discrete("Number of eGenes", expand = c(0,0)) +
#scale_y_continuous(limits=c(0, 7), expand=c(0,0)) +
theme_classic() +
theme(plot.title = element_blank(),
#axis.title.x = element_blank(),
legend.text=element_text(size=20),
legend.title=element_blank(),
legend.position = "none",
#legend.direction = "horizontal",
axis.text=element_text(size=20, colour = "black"),
axis.title=element_text(size=24, colour = "black"),
strip.text.x = element_text(size = 19, colour = "black")) +
geom_text(aes(y = genes, label = paste0("n=", genes)),
position=position_dodge(width=0.9), vjust=-0.25, color = "black",
size = 8, fontface = 'italic')
#labs(y = "Number of eGenes")
#dev.off()
# reading significant spatial interactions in lung
sp_lung <- read.table(gzfile("results/codes3d/lung/significant_eqtls.txt.gz"),
header = TRUE, sep = "\t") #151
# getting eqtls vs non-eqtls in lung
copd.snp.lung <- data.frame(
interactions = c("Cis","Trans-intra", "Trans-inter"),
number = c(148, 1, 2),
percentage = c(round(148/151*100, 2),
round(1/151*100, 2),
round(2/151*100, 2)))
#pdf("figures/functional_annotation/lung_copd_cis_vs_trans.pdf", width = 7, height = 9)
ggplot(copd.snp.lung, aes(x = factor(interactions,
level=c("Cis","Trans-intra", "Trans-inter")),
y = percentage)) +
geom_bar(stat="identity", position="dodge", fill="#009444") +
scale_fill_manual(values=c("#009444")) +
scale_x_discrete("Interaction type") +
scale_y_continuous("Percentage", expand = c(0,0), limits = c(0, 105)) +
theme_classic() +
theme(plot.title = element_blank(),
#axis.title.x = element_blank(),
legend.text=element_text(size=20),
legend.title=element_blank(),
legend.position = "none",
#legend.direction = "horizontal",
axis.text=element_text(size=20, colour = "black"),
axis.title=element_text(size=24, colour = "black"),
strip.text.x = element_text(size = 15, colour = "black")) +
geom_text(aes(y = percentage, label = paste0("n=", number)),
position=position_dodge(width=0.9), vjust=-0.25, color = "black",
size = 8, fontface = 'italic')
#dev.off()
# getting spatially regulated genes in lung
sp_lung_egenes <- unique(sp_lung$gene) # 107
# getting eqtls vs non-eqtls in lung
copd.gene.lung <- data.frame(
gene_types = c("Protein-coding","Non-coding RNA", "Pseudogene"),
number = c(84, 22, 1),
percentage = c(round(84/107*100, 2),
round(22/107*100, 2),
round(1/107*100, 2)))
#pdf("figures/functional_annotation/lung_copd_gene_types.pdf", width = 8, height = 9)
ggplot(copd.gene.lung, aes(x = factor(gene_types,
level=c("Protein-coding","Non-coding RNA", "Pseudogene")),
y = percentage)) +
geom_bar(stat="identity", position="dodge", fill="#009444") +
scale_fill_manual(values=c("#009444")) +
scale_y_continuous("Percentage", expand = c(0,0), limits = c(0, 85)) +
scale_x_discrete("Gene type") +
theme_classic() +
theme(plot.title = element_blank(),
#axis.title.x = element_blank(),
legend.text=element_text(size=20),
legend.title=element_blank(),
legend.position = "none",
#legend.direction = "horizontal",
axis.text=element_text(size=20, colour = "black"),
axis.title=element_text(size=24, colour = "black"),
strip.text.x = element_text(size = 15, colour = "black")) +
geom_text(aes(y = percentage, label = paste0("n=", number)),
position=position_dodge(width=0.9), vjust=-0.25, color = "black",
size = 8, fontface = 'italic')
#dev.off()
The STRING PPI network (version 11.5,
protein.links.full.v11.0.txt.gz) was downloaded from STRING
website. We extracted only human PPIs with a combined score ≥ 700
between proteins:
zgrep ^"9606\." protein.links.full.v11.5.txt.gz | awk '($16 >= 700) { print $1, $2, $16 }' > human_PPIs.full.v11.5_700.txt
Made the file tab separated:
tr ' ' '\t' < human_PPIs.full.v11.5_700.txt > tab_human_PPIs.full.v11.5_700.txt
And compressed it:
gzip tab_human_PPIs.full.v11.5_700.txt
# loading human PPI network
PPI_700 <- read.table(gzfile("results/string/tab_human_PPIs.full.v11.5_700.txt.gz"),
header = FALSE, sep="\t") # 505,968 PPIs
# removing all characters before and up to "."
PPI_700$V1 <- gsub(".*\\.", "", PPI_700$V1); PPI_700$V2 <- gsub(".*\\.", "", PPI_700$V2)
PPI_700 <- distinct(PPI_700) # 505,968 PPIs
#write.table(PPI_700, file = "results/string/PPI_700.txt", sep = "\t", col.names = FALSE, row.names=FALSE)
# extracting unique proteins
unique_proteins <- unique(c(PPI_700$V1, PPI_700$V2)) # 16,814
Overall, we extracted 505968 PPIs between a total of 16814 unique human proteins.
Ensembl protein identifiers were mapped to Ensembl gene identifiers.
# reading lung-specific gene regulatory network (GRN) data
lung_GRN_genes <- read.table(gzfile("results/lung_specific_GRN/lung_specific_GRN_genes.txt.gz")) # 15,855 genes
colnames(lung_GRN_genes) <- c("gene_id", "gene_name")
lung_GRN_genes <- unique(lung_GRN_genes)
# converting transcript IDs to gene IDs
lung_GRN_genes$gene_id <- gsub("\\..*","", lung_GRN_genes$gene_id)
# querying protein features
edb <- EnsDb.Hsapiens.v86
#hasProteinData(edb) # TRUE
#listTables(edb)
# getting protein information for gene IDs
lung_GRN_proteins <- genes(edb, filter = ~ gene_id %in% lung_GRN_genes$gene_id,
columns = c("protein_id", "gene_name"))
l_p_df <- bind_rows(lung_GRN_proteins@elementMetadata@listData) # 79,005
#write.table(l_p_df, file = "results/string/lung_GRN_proteins.txt", sep = "\t", col.names = TRUE, row.names=FALSE)
# getting only protein and gene IDs and removing protein-gene pairs containing NAs
l_p_df <- l_p_df[, c("protein_id", "gene_name")]
l_p_df <- l_p_df[complete.cases(l_p_df), ] # 65,742 proteins
#write.table(l_p_df, file = "results/string/lung_GRN_gene_pairs.txt", sep = "\t", col.names = TRUE, row.names=FALSE)
l_p_df <- read.table(gzfile("results/string/lung_GRN_gene_pairs.txt.gz"), header = TRUE,
sep="\t", colClasses = "character")
Lung-specific PPIN (LSPPIN) was constructed by combining the STRING PPI network with lung-specific GRN.
# subsetting PPIs that are present in lung tissue
l_PPI_700 <- subset(PPI_700, (PPI_700$V1 %in% l_p_df$protein_id & PPI_700$V2 %in% l_p_df$protein_id))
#write.table(l_PPI_700, file = "results/string/LSPPIN.txt", sep = "\t", col.names = FALSE, row.names=FALSE) # 210,192 PPIs
l_unique_proteins <- unique(c(l_PPI_700$V1, l_PPI_700$V2)) # 10,188
The resulting LSPPIN contained 210192 PPIs between 10188 unique proteins in the lung tissue.
Ensembl protein identifiers were mapped to Ensembl gene identifiers.
l_dict <- l_p_df$gene_name # 65,742 proteins
names(l_dict) <- l_p_df$protein_id
# This function maps gene names to protein IDs. It takes two arguments: ppi is a dataframe containing IDs for interacting proteins (first two columns: protein1 and protein2) and combined score for their interaction (the third column)
create_lsppi <- function(ppi, dict){
df <- data.frame()
for(i in 1:nrow(ppi)){
cat("Analysing PPI: ", i, "\n")
tryCatch({
p1 <- ppi[i,][1]; p2 <- ppi[i,][2]; comb <- ppi[i,][3]
t <- c(dict[p1$V1], dict[p2$V2], comb$V3)
#cat("PPI is: ", t, "\n")
df <- rbind(df, t)
}, error=function(e){
cat("ERROR: ", conditionMessage(e), "\n")
})
}
df <- distinct(df)
colnames(df) <- c("p1", "p2", "combined_score")
return(df)
}
# Testing
#test_subset <- subset(l_PPI_700, l_PPI_700$V3>998) # 8,410 PPIs
#test_lsppi.df <- create_lsppi(test_subset, l_dict)
# Uncomment the line below
#lsppin.df <- create_lsppi(l_PPI_700, l_dict) # 210,192 PPIs
#write.table(lsppin.df, file = "results/string/LSPPIN.txt", sep = "\t", col.names = TRUE, row.names=FALSE)
lsppin.df <- read.table(gzfile("results/string/LSPPIN.txt.gz"), header = TRUE, sep = "\t")
# getting the number of human PPIs in STRING and LSPPIN (with combined score 0.7).
PPI_700 <- read.table(gzfile("results/string/tab_human_PPIs.full.v11.5_700.txt.gz"),
header = FALSE, sep="\t") # 505,968 PPIs
# removing all characters before and up to "."
PPI_700$V1 <- gsub(".*\\.", "", PPI_700$V1); PPI_700$V2 <- gsub(".*\\.", "", PPI_700$V2)
string_700 <- distinct(PPI_700) # 505,968 PPIs
lsppin_700 <- subset(string_700, (string_700$V1 %in% l_p_df$protein_id &
string_700$V2 %in% l_p_df$protein_id)) # 210,192 PPIs
lung.map.ppis <- data.frame(
ppis = rep(c("LSPPIN","STRING")),
number = c(210192, (505968-210192)),
percentage = c(round((210192/505968)*100, 2),
round(((505968-210192)/505968)*100, 2)))
#pdf("figures/lung_eqtl_map/lung_map_ppis.pdf", width = 9, height = 9)
ggplot(lung.map.ppis, aes(x = "", y = percentage, fill = ppis)) +
geom_bar(width = 1, stat = "identity", fill = c("#009444", "darkgrey"), color = "white") +
coord_polar("y", start = 0)+
#geom_text(aes(y = percentage, label = snps), color = "white")+
#scale_fill_manual(values = mycols) +
theme_void()
#dev.off()
# getting the number per chromosome
string_proteins <- unique(c(string_700$V1, string_700$V2)) # 16,814
lsspin_proteins <- unique(c(lsppin_700$V1, lsppin_700$V2)) # 10,188
lung.map.p <- data.frame(
ppis = rep(c("LSPPIN","STRING")),
number = c(10188, (16814-10188)),
percentage = c(round((10188/16814)*100, 2),
round(((16814-10188)/16814)*100, 2)))
#pdf("figures/lung_eqtl_map/lung_map_proteins.pdf", width = 9, height = 9)
ggplot(lung.map.p, aes(x = "", y = percentage, fill = ppis)) +
geom_bar(width = 1, stat = "identity", fill = c("#009444", "darkgrey"), color = "white") +
coord_polar("y", start = 0) +
#geom_text(aes(y = percentage, label = snps), color = "white")+
#scale_fill_manual(values = mycols) +
theme_void()
#dev.off()
To build COPD-specific LSPPIN, only interactions between COPD-associated genes we extracted from LSPPIN.
# subsetting COPD-associated PPIs that are present in lung tissue
copd_lsppin.df <- subset(lsppin.df, (lsppin.df$p1 %in% sp_lung_egenes
& lsppin.df$p2 %in% sp_lung_egenes))
#write.table(copd_lsppin.df, file = "results/string/COPD_LSPPIN_links.txt", sep = "\t", col.names = FALSE, row.names=FALSE) # 28 PPIs
copd_l_unique_proteins <- unique(c(copd_lsppin.df$p1, copd_lsppin.df$p2)) # 23 proteins
The resulting COPD-associated LSPPIN contained 28 PPIs between 23 unique proteins in the lung.
The Louvain clustering algorithm was applied to identify COPD-specific clusters of functionally related proteins within the LSPPIN.
# loading ASD-specific PPI links and nodes
copd_l_links <- read.table(gzfile("results/string/COPD_LSPPIN_links.txt.gz"),
sep = "\t", header=TRUE)
copd_l_nodes <- read.table(gzfile("results/string/COPD_LSPPIN_nodes.txt.gz"),
sep = "\t", header=TRUE)
## building COPD LSPPIN
copd_graph <- graph_from_data_frame(d=copd_l_links, vertices=copd_l_nodes, directed=F)
# louvain clustering
copd_lc <- cluster_louvain(copd_graph, weights=NA) #class(a_lc)
#membership(a_lc)
communities(copd_lc) # 9 clusters
## $`1`
## [1] "ADAMTSL3" "THSD4"
##
## $`2`
## [1] "BCAR1" "ITGA1"
##
## $`3`
## [1] "CHRNA3" "CHRNA5" "FAM13A" "IREB2"
##
## $`4`
## [1] "DMPK" "SIX5"
##
## $`5`
## [1] "HLA-C" "HLA-DQB1"
##
## $`6`
## [1] "MOCS2" "MSL1" "NUPR1" "SGF29"
##
## $`7`
## [1] "PPP1CB" "PPP1R18" "PPP1R3B"
##
## $`8`
## [1] "SCGB1A1" "SFTPD"
##
## $`9`
## [1] "SSH2" "TESK2"
#col_nodes <- c("#92278F", "#939598")
col_names <- c("#1C75BC", "#ED1C24", "black")
set.seed(1234)
copd <- layout_with_fr(copd_graph)
#pdf("figures/copd_ppi/copd_LSPPIN_with_names.pdf", width = 14, height = 10)
plot(copd_graph, edge.width=2.5, edge.color="lightgray",
vertex.color="#009444", vertex.frame.color="#009444",
vertex.size=V(copd_graph)$logTPM*7, vertex.label.color=col_names[V(copd_graph)$regulation],
layout=copd) # vertex.label=NA
#dev.off()
Pathway analysis was performed using the g:GOSt module of the g:Profiler tool. The significance level was determined using Benjamini-Hochberg algorithm (FDR < 0.05) (7 April 2022). Uncomment the lines below to query g:GOSt.
# This function quieries g:GOSt module of the g:Profiler tool. It takes a vector of genes and quieries the GOSt module. It outputs the dataframe with the query results for the genes.
query_kegg <- function(genes){
tryCatch({
t <- gost(query = genes, organism = "hsapiens", ordered_query = TRUE,
multi_query = FALSE, significant = TRUE, exclude_iea = FALSE,
measure_underrepresentation = FALSE, evcodes = TRUE,
user_threshold = 0.05, correction_method = "fdr",
domain_scope = "annotated", custom_bg = NULL,
numeric_ns = "", sources = c("KEGG"), as_short_link = FALSE)
return(t)
}, error=function(e){
cat("ERROR: ", conditionMessage(e), "\n")
})
}
# Pathways analysis for 9 COPD-associated PPIN clusters in the lung
communities(copd_lc) # 9 clusters
## $`1`
## [1] "ADAMTSL3" "THSD4"
##
## $`2`
## [1] "BCAR1" "ITGA1"
##
## $`3`
## [1] "CHRNA3" "CHRNA5" "FAM13A" "IREB2"
##
## $`4`
## [1] "DMPK" "SIX5"
##
## $`5`
## [1] "HLA-C" "HLA-DQB1"
##
## $`6`
## [1] "MOCS2" "MSL1" "NUPR1" "SGF29"
##
## $`7`
## [1] "PPP1CB" "PPP1R18" "PPP1R3B"
##
## $`8`
## [1] "SCGB1A1" "SFTPD"
##
## $`9`
## [1] "SSH2" "TESK2"
# Uncomment the lines below to query g:GOSt
# cluster 1
communities(copd_lc)[1] # to get gene names
## $`1`
## [1] "ADAMTSL3" "THSD4"
c_1 <- communities(copd_lc)$'1' # to get gene names
#c_1_path <- query_kegg(c_1)
#c_1_paths <- c_1_path$result
#c_1.df <- apply(c_1_paths, 2, as.character)
#write.table(c_1.df, file = "results/kegg/copd_LSPPIN_cluster1_kegg_fdr.txt", sep = "\t", col.names = TRUE, row.names=FALSE)
# cluster 2
communities(copd_lc)[2] # to get gene names
## $`2`
## [1] "BCAR1" "ITGA1"
c_2 <- communities(copd_lc)$'2' # to get gene names
# c_2_path <- query_kegg(c_2)
# c_2_paths <- c_2_path$result
# c_2.df <- apply(c_2_paths, 2, as.character)
# write.table(c_2.df, file = "results/kegg/copd_LSPPIN_cluster2_kegg_fdr.txt", sep = "\t", col.names = TRUE, row.names=FALSE)
# cluster 3
communities(copd_lc)[3] # to get gene names
## $`3`
## [1] "CHRNA3" "CHRNA5" "FAM13A" "IREB2"
c_3 <- communities(copd_lc)$'3' # to get gene names
# c_3_path <- query_kegg(c_3)
# c_3_paths <- c_3_path$result
# c_3.df <- apply(c_3_paths, 2, as.character)
# write.table(c_3.df, file = "results/kegg/copd_LSPPIN_cluster3_kegg_fdr.txt", sep = "\t", col.names = TRUE, row.names=FALSE)
# cluster 4
communities(copd_lc)[4] # to get gene names
## $`4`
## [1] "DMPK" "SIX5"
c_4 <- communities(copd_lc)$'4' # to get gene names
#c_4_path <- query_kegg(c_4) # Unknown pathway
# cluster 5
communities(copd_lc)[5] # to get gene names
## $`5`
## [1] "HLA-C" "HLA-DQB1"
c_5 <- communities(copd_lc)$'5' # to get gene names
# c_5_path <- query_kegg(c_5)
# c_5_paths <- c_5_path$result
# c_5.df <- apply(c_5_paths, 2, as.character)
# write.table(c_5.df, file = "results/kegg/copd_LSPPIN_cluster5_kegg_fdr.txt", sep = "\t", col.names = TRUE, row.names=FALSE)
# cluster 6
communities(copd_lc)[6] # to get gene names
## $`6`
## [1] "MOCS2" "MSL1" "NUPR1" "SGF29"
c_6 <- communities(copd_lc)$'6' # to get gene names
# c_6_path <- query_kegg(c_6)
# c_6_paths <- c_6_path$result
# c_6.df <- apply(c_6_paths, 2, as.character)
# write.table(c_6.df, file = "results/kegg/copd_LSPPIN_cluster6_kegg_fdr.txt", sep = "\t", col.names = TRUE, row.names=FALSE)
# cluster 7
communities(copd_lc)[7] # to get gene names
## $`7`
## [1] "PPP1CB" "PPP1R18" "PPP1R3B"
c_7 <- communities(copd_lc)$'7' # to get gene names
# c_7_path <- query_kegg(c_7)
# c_7_paths <- c_7_path$result
# c_7.df <- apply(c_7_paths, 2, as.character)
# write.table(c_7.df, file = "results/kegg/copd_LSPPIN_cluster7_kegg_fdr.txt", sep = "\t", col.names = TRUE, row.names=FALSE)
# cluster 8
communities(copd_lc)[8] # to get gene names
## $`8`
## [1] "SCGB1A1" "SFTPD"
c_8 <- communities(copd_lc)$'8' # to get gene names
#c_8_path <- query_kegg(c_8) # Unknown pathway
# cluster 9
communities(copd_lc)[9] # to get gene names
## $`9`
## [1] "SSH2" "TESK2"
c_9 <- communities(copd_lc)$'9' # to get gene names
# c_9_path <- query_kegg(c_9)
# c_9_paths <- c_9_path$result
# c_9.df <- apply(c_9_paths, 2, as.character)
# write.table(c_9.df, file = "results/kegg/copd_LSPPIN_cluster9_kegg_fdr.txt", sep = "\t", col.names = TRUE, row.names=FALSE)
GO analysis was performed using the g:GOSt module of the g:Profiler tool. The significance level was determined using Benjamini-Hochberg algorithm (FDR < 0.05) (7 April 2022).
# This function quieries g:GOSt module of the g:Profiler tool. It takes a vector of genes and quieries the GOSt module. It outputs the dataframe with the query results for the genes.
query_go <- function(genes){
tryCatch({
t <- gost(query = genes, organism = "hsapiens", ordered_query = TRUE,
multi_query = FALSE, significant = TRUE, exclude_iea = FALSE,
measure_underrepresentation = FALSE, evcodes = TRUE,
user_threshold = 0.05, correction_method = "fdr",
domain_scope = "annotated", custom_bg = NULL,
numeric_ns = "", sources = "GO", as_short_link = FALSE)
return(t[["result"]])
}, error=function(e){
cat("ERROR: ", conditionMessage(e), "\n")
})
}
# COPD-associated eGenes
copd_lung_go <- query_go(sp_lung_egenes)
copd_lung_gos.df <- apply(copd_lung_go, 2, as.character)
#write.table(copd_lung_gos.df, file = "results/go/copd_lung_all_gos_fdr.txt", sep = "\t", col.names = TRUE, row.names=FALSE)
copd_lung_gos.df <- read.table(gzfile("results/go/copd_lung_all_gos_fdr.txt.gz"),
header = TRUE, sep="\t")
# Extraction top 10 terms for biological process
copd_lung_go_bp <- copd_lung_gos.df[grep('GO:BP', copd_lung_gos.df[, "source"]), ]
copd_lung_go_bp_top10 <- as.data.frame(copd_lung_go_bp[1:10,])
#pdf("figures/go/copd_lung_bp_gos_fdr_top10.pdf", width = 9, height = 4)
ggplot(copd_lung_go_bp_top10, aes(x=factor(term_name, levels = rev(levels(factor(term_name)))),
y=-log10(as.numeric(p_value)), fill="#009444")) +
geom_bar(stat="identity") +
theme_classic() +
theme(plot.title = element_blank(),
axis.title.x = element_text(size=16, colour = "black"),
axis.text=element_text(size=16, colour = "black"),
axis.title.y = element_blank(),
legend.position = "none") +
scale_fill_manual(values=c("#009444")) +
labs(y = "-log10(p)") +
geom_hline(aes(yintercept=-log10(as.numeric(0.05))), colour = "red", size = 1) +
coord_flip()
#dev.off()
# Extraction top 10 terms for molecular function
copd_lung_go_mf <- copd_lung_gos.df[grep('GO:MF', copd_lung_gos.df[, "source"]), ]
copd_lung_go_mf_top10 <- as.data.frame(copd_lung_go_mf[1:10,])
#pdf("figures/go/copd_lung_mf_gos_fdr_top10.pdf", width = 13, height = 4)
ggplot(copd_lung_go_mf_top10, aes(x=factor(term_name, levels = rev(levels(factor(term_name)))),
y=-log10(as.numeric(p_value)), fill="#009444")) +
geom_bar(stat="identity") +
theme_classic() +
theme(plot.title = element_blank(),
axis.title.x = element_text(size=16, colour = "black"),
axis.text=element_text(size=16, colour = "black"),
axis.title.y = element_blank(),
legend.position = "none") +
scale_fill_manual(values=c("#009444")) +
labs(y = "-log10(p)") +
geom_hline(aes(yintercept=-log10(as.numeric(0.05))), colour = "red", size = 1) +
coord_flip()
#dev.off()
# Extraction top 10 terms for cellular component
copd_lung_go_cc <- copd_lung_gos.df[grep('GO:CC', copd_lung_gos.df[, "source"]), ]
copd_lung_go_cc_top10 <- as.data.frame(copd_lung_go_cc[1:10,])
#pdf("figures/go/copd_lung_cc_gos_fdr_top10.pdf", width = 7, height = 4)
ggplot(copd_lung_go_cc_top10, aes(x=factor(term_name, levels = rev(levels(factor(term_name)))),
y=-log10(as.numeric(p_value)), fill="#009444")) +
geom_bar(stat="identity") +
theme_classic() +
theme(plot.title = element_blank(),
axis.title.x = element_text(size=16, colour = "black"),
axis.text=element_text(size=16, colour = "black"),
axis.title.y = element_blank(),
legend.position = "none") +
scale_fill_manual(values=c("#009444")) +
labs(y = "-log10(p)") +
geom_hline(aes(yintercept=-log10(as.numeric(0.05))), colour = "red", size = 1) +
coord_flip()
#dev.off()
copd_lung_string_multm <-read.table(gzfile("results/multimorbid3d/copd/significant_enrichment_bootstrap.txt.gz"), header = TRUE, sep = "\t", quote="", encoding="UTF-8")
#pdf("figures/multimorbidity/copd_lung_string_multimorbidiry.pdf", width = 9, height = 11)
s_gg <- ggplot(copd_lung_string_multm, aes(x = level, y = trait)) +
geom_point(aes(size = trait_eqtls, fill = adj_pval), alpha = 0.75, shape = 21) +
labs( x = "", y = "", size = "Number of eQTLs", fill = "Adj p value") +
theme(legend.key=element_blank(),
axis.text.x = element_text(colour = "black", size = 8, angle = 90,
vjust = 0.3, hjust = 1),
axis.text.y = element_text(colour = "black", size = 8),
legend.text = element_text(size = 8, colour ="black"),
legend.title = element_text(size = 9, face = "bold"),
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill = NA, size = 1),
legend.position = "right")
s_gg
#dev.off()
cad_lung_string_multm <- read.table(gzfile("results/multimorbid3d/cad/significant_enrichment_bootstrap.txt.gz"), header = TRUE, sep = "\t", quote="", encoding="UTF-8")
ud_lung_string_multm <- read.table(gzfile("results/multimorbid3d/ud/significant_enrichment_bootstrap.txt.gz"), header = TRUE, sep = "\t", quote="", encoding="UTF-8")
#pdf("figures/multimorbidity/cad_lung_string_multimorbidiry.pdf", width = 13, height = 11)
cad_gg <- ggplot(cad_lung_string_multm, aes(x = level, y = trait)) +
geom_point(aes(size = trait_eqtls, fill = adj_pval), alpha = 0.75, shape = 21) +
labs( x = "", y = "", size = "Number of eQTLs", fill = "Adj p value") +
theme(legend.key=element_blank(),
axis.text.x = element_text(colour = "black", size = 8, angle = 90,
vjust = 0.3, hjust = 1),
axis.text.y = element_text(colour = "black", size = 8),
legend.text = element_text(size = 8, colour ="black"),
legend.title = element_text(size = 9, face = "bold"),
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill = NA, size = 1),
legend.position = "right")
cad_gg
#dev.off()
#pdf("figures/multimorbidity/ud_lung_string_multimorbidiry.pdf", width = 13, height = 11)
ud_gg <- ggplot(ud_lung_string_multm, aes(x = level, y = trait)) +
geom_point(aes(size = trait_eqtls, fill = adj_pval), alpha = 0.75, shape = 21) +
labs( x = "", y = "", size = "Number of eQTLs", fill = "Adj p value") +
theme(legend.key=element_blank(),
axis.text.x = element_text(colour = "black", size = 8, angle = 90,
vjust = 0.3, hjust = 1),
axis.text.y = element_text(colour = "black", size = 8),
legend.text = element_text(size = 8, colour ="black"),
legend.title = element_text(size = 9, face = "bold"),
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill = NA, size = 1),
legend.position = "right")
ud_gg
#dev.off()
lsppin.df <- read.table(gzfile("results/string/LSPPIN.txt.gz"), header = TRUE, sep = "\t")
### cluster 1 - sulfur rely system and folate biosynthesis
cl1_l1_1 <- lsppin.df[grep(c('NUPR1'), lsppin.df[, c("p1")]), ] # 4
cl1_l1_2 <- lsppin.df[grep(c('MSL1'), lsppin.df[, c("p1")]), ] # 24
cl1_l1_3 <- lsppin.df[grep(c('SGF29'), lsppin.df[, c("p1")]), ] # 39
cl1_l1_4 <- lsppin.df[grep(c('MOCS2'), lsppin.df[, c("p1")]), ] # 13
cl1_l2_1 <- lsppin.df[grep(c('KAT8'), lsppin.df[, c("p1")]), ] # 41
cl1_l2_2 <- lsppin.df[grep(c('RUVBL1'), lsppin.df[, c("p1")]), ] # 124
cl1_l2_3 <- lsppin.df[grep(c('SUOX'), lsppin.df[, c("p1")]), ] # 40
# heatmaps for levels 0 and 1
colors = colorRamp2(c(-1.5,0,1.5), c("blue", "#E8E8E8","red"), space = "RGB")
p1_l0 <- read.table("results/heatmaps/path1_level0.txt", sep = "\t", header=TRUE, row.names = 1)
## Warning in read.table("results/heatmaps/path1_level0.txt", sep = "\t", header
## = TRUE, : incomplete final line found by readTableHeader on 'results/heatmaps/
## path1_level0.txt'
row_od = c("MSL1", "MOCS2", "NUPR1", "SGF29")
p1_l0.mat <- as.matrix(p1_l0)
ha = rowAnnotation(foo = anno_empty(border = TRUE))
ht = HeatmapAnnotation(top = anno_empty(border = FALSE))
#pdf("figures/heatmaps/p1_l0.pdf", width = 14, height = 10)
Heatmap(p1_l0.mat, name = "log2(aFC)", #column_title = "eQTL SNPs",
col = colors, na_col = "white", #row_title = "shared eGenes",
cluster_rows = FALSE, cluster_columns = FALSE, row_names_side = "left",
column_names_side = "top", row_names_gp = gpar(fontface = "italic"),
row_order = row_od,
width = unit(10, "cm"), height = unit(10, "cm"),
right_annotation = ha, top_annotation = ht,
border = TRUE, rect_gp = gpar(col = "grey"))
decorate_annotation("foo", slice = 1, {
grid.rect(x = 0, width = unit(10, "mm"), gp = gpar(fill = NA, col = NA), just = "left")
grid.text(paste("Level 0", collapse = "\n"), x = unit(5, "mm"), just = "centre", rot = 90,
gp = gpar(fontsize = 13, fontface = "bold"))})
decorate_annotation("top", slice = 1, {
grid.rect(x = 0, width = unit(0.7, "mm"), gp = gpar(fill = NA, col = NA), just = "centre")})
#dev.off()
p1_l1 <- read.table("results/heatmaps/path1_level1.txt", sep = "\t", header=TRUE, row.names = 1)
## Warning in read.table("results/heatmaps/path1_level1.txt", sep = "\t", header
## = TRUE, : incomplete final line found by readTableHeader on 'results/heatmaps/
## path1_level1.txt'
row_od = c("KAT8", "RUVBL1", "SUOX")
p1_l1.mat <- as.matrix(p1_l1)
ha = rowAnnotation(foo = anno_empty(border = TRUE))
ht = HeatmapAnnotation(top = anno_empty(border = FALSE))
#pdf("figures/heatmaps/p1_l1.pdf", width = 14, height = 10)
Heatmap(p1_l1.mat, name = "log2(aFC)", #column_title = "eQTL SNPs",
col = colors, na_col = "white", #row_title = "shared eGenes",
cluster_rows = FALSE, cluster_columns = FALSE, row_names_side = "left",
column_names_side = "top", row_names_gp = gpar(fontface = "italic"),
row_order = row_od,
width = unit(18, "cm"), height = unit(7, "cm"),
right_annotation = ha, top_annotation = ht,
border = TRUE, rect_gp = gpar(col = "grey"))
decorate_annotation("foo", slice = 1, {
grid.rect(x = 0, width = unit(10, "mm"), gp = gpar(fill = NA, col = NA), just = "left")
grid.text(paste("Level 1", collapse = "\n"), x = unit(5, "mm"), just = "centre", rot = 90,
gp = gpar(fontsize = 13, fontface = "bold"))})
decorate_annotation("top", slice = 1, {
grid.rect(x = 0, width = unit(0.7, "mm"), gp = gpar(fill = NA, col = NA), just = "centre")})
#dev.off()
### cluster 2 - axon guidance and regulation of actin cytoskeleton
cl2_l1_1 <- lsppin.df[grep(c('TESK2'), lsppin.df[, c("p1")]), ] # 1
cl2_l1_2 <- lsppin.df[grep(c('SSH2'), lsppin.df[, c("p1")]), ] # 8
cl2_l2_1 <- lsppin.df[grep(c('PPP3CA'), lsppin.df[, c("p1")]), ] # 70
cl2_l2_2 <- lsppin.df[grep(c('TESK2'), lsppin.df[, c("p1")]), ] # 1
cl2_l2_3 <- lsppin.df[grep(c('LIMK1'), lsppin.df[, c("p1")]), ] # 27
cl2_l2_4 <- lsppin.df[grep(c('CFL1'), lsppin.df[, c("p1")]), ] # 39
cl2_l2_5 <- lsppin.df[grep(c('SSH1'), lsppin.df[, c("p1")]), ] # 16
cl2_l2_6 <- lsppin.df[grep(c('SSH3'), lsppin.df[, c("p1")]), ] # 8
cl2_l2_7 <- lsppin.df[grep(c('PPP3CC'), lsppin.df[, c("p1")]), ] # 47
cl2_l2_8 <- lsppin.df[grep(c('CFL2'), lsppin.df[, c("p1")]), ] # 11
# heatmaps for levels 0 and 1
colors = colorRamp2(c(-1.5,0,1.5), c("blue", "#E8E8E8","red"), space = "RGB")
p2_l0 <- read.table("results/heatmaps/path2_level0.txt", sep = "\t", header=TRUE, row.names = 1)
## Warning in read.table("results/heatmaps/path2_level0.txt", sep = "\t", header
## = TRUE, : incomplete final line found by readTableHeader on 'results/heatmaps/
## path2_level0.txt'
row_od = c("SSH2", "TESK2")
p2_l0.mat <- as.matrix(p2_l0)
ha = rowAnnotation(foo = anno_empty(border = TRUE))
ht = HeatmapAnnotation(top = anno_empty(border = FALSE))
#pdf("figures/heatmaps/p2_l0.pdf", width = 14, height = 10)
Heatmap(p2_l0.mat, name = "log2(aFC)", #column_title = "eQTL SNPs",
col = colors, na_col = "white", #row_title = "shared eGenes",
cluster_rows = FALSE, cluster_columns = FALSE, row_names_side = "left",
column_names_side = "top", row_names_gp = gpar(fontface = "italic"),
row_order = row_od,
width = unit(22, "cm"), height = unit(5, "cm"),
right_annotation = ha, top_annotation = ht,
border = TRUE, rect_gp = gpar(col = "grey"))
decorate_annotation("foo", slice = 1, {
grid.rect(x = 0, width = unit(10, "mm"), gp = gpar(fill = NA, col = NA), just = "left")
grid.text(paste("Level 0", collapse = "\n"), x = unit(5, "mm"), just = "centre", rot = 90,
gp = gpar(fontsize = 13, fontface = "bold"))})
decorate_annotation("top", slice = 1, {
grid.rect(x = 0, width = unit(0.7, "mm"), gp = gpar(fill = NA, col = NA), just = "centre")})
#dev.off()
p2_l1 <- read.table("results/heatmaps/path2_level1.txt", sep = "\t", header=TRUE, row.names = 1)
## Warning in read.table("results/heatmaps/path2_level1.txt", sep = "\t", header
## = TRUE, : incomplete final line found by readTableHeader on 'results/heatmaps/
## path2_level1.txt'
row_od = c("BIN1")
p2_l1.mat <- as.matrix(p2_l1)
ha = rowAnnotation(foo = anno_empty(border = TRUE))
ht = HeatmapAnnotation(top = anno_empty(border = FALSE))
#pdf("figures/heatmaps/p2_l1.pdf", width = 14, height = 10)
Heatmap(p2_l1.mat, name = "log2(aFC)", #column_title = "eQTL SNPs",
col = colors, na_col = "white", #row_title = "shared eGenes",
cluster_rows = FALSE, cluster_columns = FALSE, row_names_side = "left",
column_names_side = "top", row_names_gp = gpar(fontface = "italic"),
row_order = row_od,
width = unit(10, "cm"), height = unit(3, "cm"),
right_annotation = ha, top_annotation = ht,
border = TRUE, rect_gp = gpar(col = "grey"))
decorate_annotation("foo", slice = 1, {
grid.rect(x = 0, width = unit(10, "mm"), gp = gpar(fill = NA, col = NA), just = "left")
grid.text(paste("Level 1", collapse = "\n"), x = unit(5, "mm"), just = "centre", rot = 90,
gp = gpar(fontsize = 13, fontface = "bold"))})
decorate_annotation("top", slice = 1, {
grid.rect(x = 0, width = unit(0.7, "mm"), gp = gpar(fill = NA, col = NA), just = "centre")})
#dev.off()